import sys
from sklearn.linear_model import LogisticRegression
import pandas as pd
from utils import get_dataset
from tqdm import tqdm
from models.LRBinsModel import LRBinsModel
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle

# Command-line interface: the single positional argument names the dataset,
# which is loaded from data/<datasetname>.csv.
if len(sys.argv) < 2:
    # Fail with a readable usage line instead of a bare IndexError.
    sys.exit(f"usage: {sys.argv[0]} <datasetname>")
datasetname = sys.argv[1]
data = pd.read_csv(f"data/{datasetname}.csv")

# Selected hyperparameters; pickled to hyperparameters/<datasetname>.p at the
# end of the script.
hyperparameters = {}

# Per-seed test metrics (one entry appended per run) for each model family:
# plain logistic regression, LRBins, and XGBoost.
paperlracc = []
paperlrrocauc = []
paperlrwbinsacc = []
paperlrwbinsrocauc = []
paperxgbacc = []
paperxgbrocauc = []

# Number of independent runs; the run index is passed to get_dataset as
# random_state.
num_runs = 1
# Main experiment loop. For every seed: tune XGBoost and LRBins on the
# validation split, then report test metrics for XGBoost, LRBins and a plain
# logistic-regression baseline restricted to the features LRBins ranks highest.
# NOTE: `hyperparameters` is overwritten on each iteration, so with
# num_runs > 1 only the last seed's selection is saved.
for seed in range(num_runs):
    # NOTE(review): the split names are taken from the unpacking order below;
    # confirm against utils.get_dataset that they are train/validation/test.
    X_train, X_val, X_test, y_train, y_val, y_test, feature_names = get_dataset(
        data, normalize=True, random_state=seed
    )
    num_features = X_train.shape[1]

    # Integer labels for every metric computation below.
    # Assumes y_val supports .astype like y_test does -- TODO confirm.
    y_val = y_val.astype(int)
    y_test = y_test.astype(int)

    # ---- XGBoost: grid search over depth and number of trees ----
    # Selection uses the validation split so that the test split stays
    # untouched until the final evaluation (no test-set leakage).
    best_xgb_clf = None
    best_xgb_i = None
    best_xgb_j = None
    best_xgb_val_rocauc = float("-inf")
    for max_depth in tqdm(range(9, 2, -1)):
        for n_estimators in [500, 100]:
            candidate = XGBClassifier(max_depth=max_depth, n_estimators=n_estimators)
            candidate.fit(X_train, y_train)
            val_probs = candidate.predict_proba(X_val)[:, 1]
            val_rocauc = roc_auc_score(y_val, val_probs)
            # Strict comparison: on ties the first configuration tried wins.
            if val_rocauc > best_xgb_val_rocauc:
                best_xgb_val_rocauc = val_rocauc
                best_xgb_clf = candidate
                best_xgb_i = max_depth
                best_xgb_j = n_estimators

    hyperparameters["xgb_max_depth"] = best_xgb_i
    hyperparameters["xgb_n_estimators"] = best_xgb_j

    # Reuse the already-fitted best candidate rather than refitting the same
    # configuration on the same training data.
    xgb_probs = best_xgb_clf.predict_proba(X_test)[:, 1]
    xgb_preds = best_xgb_clf.predict(X_test).astype(int)
    paperxgbrocauc.append(roc_auc_score(y_test, xgb_probs))
    paperxgbacc.append(accuracy_score(y_test, xgb_preds))

    # ---- LRBins: grid search over the number of bin features ----
    best_lrbins_model = None
    best_i = -1
    best_j = -1
    best_lrbins_val_rocauc = float("-inf")
    for n_bin_features in tqdm(range(2, min(7, num_features))):
        for n_inference_features in [min(30, num_features)]:
            candidate = LRBinsModel(
                n_bin_features=n_bin_features,
                n_inference_features=n_inference_features,
            )
            candidate.fit(X_train, y_train)
            val_results = candidate.performance(X_val, y_val)
            if val_results["rocauc"] > best_lrbins_val_rocauc:
                best_lrbins_val_rocauc = val_results["rocauc"]
                best_lrbins_model = candidate
                best_i = n_bin_features
                best_j = n_inference_features

    if best_lrbins_model is None:
        # The grid is empty when num_features <= 2; a NaN score would also
        # leave nothing selected. Fail loudly instead of hitting a NameError.
        raise ValueError(
            f"LRBins grid search selected no model (num_features={num_features}); "
            "the grid needs at least 3 features and a comparable validation ROC AUC"
        )

    hyperparameters["lrbins_n_bin_features"] = best_i
    hyperparameters["lrbins_n_inference_features"] = best_j

    # Test metrics of the configuration chosen on the validation split.
    lrbins_test_results = best_lrbins_model.performance(X_test, y_test)
    paperlrwbinsrocauc.append(lrbins_test_results["rocauc"])
    paperlrwbinsacc.append(lrbins_test_results["accuracy"])

    # ---- Logistic-regression baseline ----
    # Use the *selected* LRBins model (not merely the last one fitted) to pick
    # the feature subset, so the baseline matches the recorded hyperparameters.
    important_features_X_train = best_lrbins_model.get_important_features(
        X_train,
        best_lrbins_model.feature_importances,
        best_lrbins_model.n_inference_features,
    )
    important_features_X_test = best_lrbins_model.get_important_features(
        X_test,
        best_lrbins_model.feature_importances,
        best_lrbins_model.n_inference_features,
    )
    lr_clf = LogisticRegression()
    lr_clf.fit(important_features_X_train, y_train)
    lr_probs = lr_clf.predict_proba(important_features_X_test)[:, 1]
    lr_preds = lr_clf.predict(important_features_X_test).astype(int)
    paperlrrocauc.append(roc_auc_score(y_test, lr_probs))
    paperlracc.append(accuracy_score(y_test, lr_preds))

# Persist the selected hyperparameters for this dataset.
import os

# Create the output directory first so a missing folder does not discard the
# results of the whole run with a FileNotFoundError.
os.makedirs("hyperparameters", exist_ok=True)
with open(f"hyperparameters/{datasetname}.p", "wb") as fp:
    pickle.dump(hyperparameters, fp)